sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 14393)
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.15.1 DT_0.2 plotly_4.5.6
## [4] readr_1.1.0 png_0.1-7 gridExtra_2.2.1
## [7] choroplethrMaps_1.0.1 choroplethr_3.6.0 acs_2.0
## [10] XML_3.98-1.6 plyr_1.8.4 stringr_1.2.0
## [13] data.world_0.1.2 shinydashboard_0.5.3 shiny_1.0.1
## [16] tidyr_0.6.1 dplyr_0.5.0 ggplot2_2.2.1
##
## loaded via a namespace (and not attached):
## [1] httr_1.2.1 maps_3.1.1 viridisLite_0.2.0
## [4] jsonlite_1.4 splines_3.3.2 Formula_1.2-1
## [7] assertthat_0.1 sp_1.2-4 latticeExtra_0.6-28
## [10] yaml_2.1.14 backports_1.0.5 lattice_0.20-34
## [13] uuid_0.1-2 digest_0.6.12 RColorBrewer_1.1-2
## [16] checkmate_1.8.2 colorspace_1.3-2 htmltools_0.3.5
## [19] httpuv_1.3.3 Matrix_1.2-7.1 WDI_2.4
## [22] purrr_0.2.2 xtable_1.8-2 scales_0.4.1
## [25] jpeg_0.1-8 tigris_0.3.3 ggmap_2.6.1
## [28] htmlTable_1.9 tibble_1.3.0 nnet_7.3-12
## [31] lazyeval_0.2.0 proto_1.0.0 survival_2.41-3
## [34] magrittr_1.5 mime_0.5 maptools_0.9-2
## [37] evaluate_0.10 foreign_0.8-67 tools_3.3.2
## [40] data.table_1.10.4 hms_0.3 geosphere_1.5-5
## [43] RgoogleMaps_1.4.1 munsell_0.4.3 cluster_2.0.5
## [46] grid_3.3.2 RCurl_1.95-4.8 rjson_0.2.15
## [49] rappdirs_0.3.1 htmlwidgets_0.8 bitops_1.0-6
## [52] base64enc_0.1-3 rmarkdown_1.4 gtable_0.2.0
## [55] DBI_0.6-1 reshape2_1.4.2 R6_2.2.0
## [58] rgdal_1.2-7 rgeos_0.3-23 Hmisc_4.0-2
## [61] rprojroot_1.2 stringi_1.1.5 Rcpp_0.12.10
## [64] mapproj_1.2-4 rpart_4.1-10 acepack_1.4.1
Note : set the working directory to the 00 Doc Folder.
Original data sources: http://www.governing.com/gov-data/internet-usage-by-state.html https://www.ntia.doc.gov/data/digital-nation-data-explorer#sel=tvBoxUser&disp=map
To download the clean data (what was used in Tableau) download CleanedInternetUsageBySatet.csv from the data.world link. https://data.world/lowatt/s-17-dv-project-6 and download ASM_2015_31AS101_with_ann.csv.
This data describes Internet Usage and Connectivity across the United States from July 2015 to October 2016. The columns that we used were a combination of the census population data,income data, internet usage and connectivity data.
First, we brought the US Census data into our R environment.
df1 <- data.world::query(connection = conn,
type = "sql",
dataset = "thule179/s-17-dv-final-project",
"select
A.AreaName,
A.`B01001_001`,
A.`B01001_002`,
A.`B01001_003`,
A.`B01001_004`,
A.`B01001_005`,
A.`B01001_006`,
A.`B01001_007`,
A.`B01001_008`,
A.`B01001_009`,
A.`B01001_010`,
A.`B01001_011`,
A.`B01001_012`,
A.`B01001_013`,
A.`B01001_014`,
A.`B01001_015`,
A.`B01001_016`,
A.`B01001_017`,
A.`B01001_018`,
A.`B01001_019`,
A.`B01001_020`,
A.`B01001_021`,
A.`B01001_022`,
A.`B01001_023`,
A.`B01001_024`,
A.`B01001_025`,
A.`B01001_026`,
A.`B01001_027`,
A.`B01001_028`,
A.`B01001_029`,
A.`B01001_030`,
A.`B01001_031`,
A.`B01001_032`,
A.`B01001_033`,
A.`B01001_034`,
A.`B01001_035`,
A.`B01001_036`,
A.`B01001_037`,
A.`B01001_038`,
A.`B01001_039`,
A.`B01001_040`,
A.`B01001_041`,
A.`B01001_042`,
A.`B01001_043`,
A.`B01001_044`,
A.`B01001_045`,
A.`B01001_046`,
A.`B01001_047`,
A.`B01001_048`,
A.`B01001_049`,
B.`B19062_001`,
B.`B19001_002`,
B.`B19001_003`,
B.`B19001_004`,
B.`B19001_005`,
B.`B19001_006`,
B.`B19001_007`,
B.`B19001_008`,
B.`B19001_009`,
B.`B19001_010`,
B.`B19001_011`,
B.`B19001_012`,
B.`B19001_013`,
B.`B19001_014`,
B.`B19001_015`,
B.`B19001_016`,
B.`B19001_017`
from `uscensusbureau`.`acs-2015-5-e-agesex`.`USA_All_States` as A,
`uscensusbureau`.`acs-2015-5-e-income`.`USA_All_States` as B
where A.AreaName = B.AreaName"
)
We then took the data table related to Facebook traffic penetration.
df1Second <- data.world::query(connection = conn,
type = "sql",
dataset = "thule179/s-17-dv-final-project",
"select
C.`Population (2010 Est.)`,
C.`Population % of USA`,
C.`Internet users June, 2010`,
C.`Internet Penetration`,
C.`Facebook users August, 2010`,
C.`Facebook Penetration`
from Facebook as C")
Lastly, we brought in data points for internet usage and connectivity.
df_InternetConnectivity <- data.world::query(connection = conn, type = "sql",
dataset = "thule179/s-17-dv-final-project",
"SELECT c.State as State, `Internet Connectivity`.`No connection anywhere (%)` as NoConnectionAnywhere, `Internet Connectivity`.`No home connection, but connect elsewhere (%)`
as NoHomeConnection_ConnectElseWhere, `Internet Connectivity`.`Connect at home only (%)` as ConnectAtHomeOnly,
InternetUsageAtHome.`Internet Usage At Home` as InternetUsageAtHome, InternetUsageAtWork.`Internet Usage At Work` as InternetUsageAtWork,
InternetUsageAtCoffeeShops.InternetUsageAtCoffeeShops as InternetUsageAtCoffeeShops,
InternetUsage.InternetUsage as InternetUsage
FROM `Internet Connectivity.xlsx/Internet Connectivity`as c, `Internet Connectivity.xlsx/InternetUsageAtHome` as h,
InternetUsageAtCoffeeShops as s, InternetUsageAtWork as w, InternetUsage as i
where c.State = h.State and s.State = h.State and h.State = w.State and w.State = i.State")
To reduce the number of columns in our main table and to make the data easier for analysis, we categorized the US population into age and income groups, separated by gender.
Put males into age categories.
male0to9 <- df1$B01001_003 + df1$B01001_004
male10to19 <- df1$B01001_005 + df1$B01001_006 + df1$B01001_007
male20to29 <- df1$B01001_008 + df1$B01001_009 + df1$B01001_010 + df1$B01001_011
male30to39 <- df1$B01001_012 + df1$B01001_013
male40to49 <- df1$B01001_014 + df1$B01001_015
male50to59 <- df1$B01001_016 + df1$B01001_017
male60to69 <- df1$B01001_018 + df1$B01001_019 + df1$B01001_020 + df1$B01001_021
male70to79 <- df1$B01001_022 + df1$B01001_023
male80andUp <- df1$B01001_024 + df1$B01001_025
Put females into age categories.
female0to9 <- df1$B01001_027 + df1$B01001_028
female10to19 <- df1$B01001_029 + df1$B01001_030 + df1$B01001_031
female20to29 <- df1$B01001_032 + df1$B01001_033 + df1$B01001_034 + df1$B01001_035
female30to39 <- df1$B01001_036 + df1$B01001_037
female40to49 <- df1$B01001_038 + df1$B01001_039
female50to59 <- df1$B01001_040 + df1$B01001_041
female60to69 <- df1$B01001_042 + df1$B01001_043 + df1$B01001_044 + df1$B01001_045
female70to79 <- df1$B01001_046 + df1$B01001_047
female80andUp <- df1$B01001_048 + df1$B01001_049
Group income into categories.
Agg_Income <- df1$B19062_001
TenToThirtyK <- df1$B19001_002 + df1$B19001_003 + df1$B19001_004 + df1$B19001_005 + df1$B19001_006
ThirtyToFiftyK <- df1$B19001_007 + df1$B19001_008 + df1$B19001_009 + df1$B19001_010
FiftyToHundredK <- df1$B19001_011 + df1$B19001_012 + df1$B19001_013
HundredToHundredFiftyK <- df1$B19001_014 + df1$B19001_015
HundredFiftyPlus <- df1$B19001_016 + df1$B19001_017
Bind all the categories into a data frame.
df2 <- as.data.frame(cbind(State = df1$AreaName, TotalPopulation = df1$B01001_001, male0to9, male10to19, male20to29, male30to39, male40to49, male50to59, male60to69, male70to79, male80andUp, female0to9, female10to19, female20to29, female30to39, female40to49, female50to59, female60to69, female70to79, female80andUp, Agg_Income,TenToThirtyK, ThirtyToFiftyK, FiftyToHundredK, HundredToHundredFiftyK, HundredFiftyPlus), stringsAsFactors = FALSE)
Change the numeric columns to numeric.
df2[-1] <- as.data.frame(apply(df2[-1], 2, as.numeric))
df2 <- cbind(df2, df1Second)
Do an inner join by State of df2 and InternetConnectivity.
df2 <- merge(df2,df_InternetConnectivity, by ="State")
Now we have our main data table, we’d need to clean up our data to prepare for analysis.
Get average income for each state.
df2$PerCapitaIncome <- df2$Agg_Income / df2$TotalPopulation
Remove ‘’ line breaks and commas in rows to convert InternetUsage into numbers.
df2[,'InternetUsage'] <- gsub(",","",df2[,'InternetUsage'])
df2[,'InternetUsage'] <- gsub("\n","",df2[,'InternetUsage'])
df2[,'InternetUsageAtCoffeeShops'] <- gsub(",","",df2[,'InternetUsageAtCoffeeShops'])
df2[,'InternetUsageAtCoffeeShops'] <- gsub("\n","",df2[,'InternetUsageAtCoffeeShops'])
df2[,'InternetUsageAtWork'] <- gsub(",","",df2[,'InternetUsageAtWork'])
df2[,'InternetUsageAtWork'] <- gsub("\n","",df2[,'InternetUsageAtWork'])
df2[,'InternetUsageAtHome'] <- gsub(",","",df2[,'InternetUsageAtHome'])
df2[,'InternetUsageAtHome'] <- gsub("\n","",df2[,'InternetUsageAtHome'])
df2$InternetUsage <- as.numeric(df2$InternetUsage)
df2$InternetUsageAtCoffeeShops <- as.numeric(df2$InternetUsageAtCoffeeShops)
df2$InternetUsageAtWork <- as.numeric(df2$InternetUsageAtWork)
df2$InternetUsageAtHome <- as.numeric(df2$InternetUsageAtHome)
Get average internet usage per person for each State.
df2$Avg_InternetUsage <- df2$InternetUsage
df2$Avg_InternetUsage <- df2$InternetUsage / df2$TotalPopulation
Get average internet usage at work.
df2$Avg_InternetUsageAtWork <- df2$InternetUsageAtWork
df2$Avg_InternetUsageAtWork <- df2$InternetUsageAtWork / df2$TotalPopulation
Get average internet usage at home.
df2$Avg_InternetUsageAtHome <- df2$InternetUsageAtHome
df2$Avg_InternetUsageAtHome <- df2$InternetUsageAtHome / df2$TotalPopulation
Get average internet usage at coffee shops.
df2$Avg_InternetUsageAtCoffeeShops <- df2$InternetUsageAtCoffeeShops
df2$Avg_InternetUsageAtCoffeeShops <- df2$InternetUsageAtCoffeeShops / df2$TotalPopulation
Find high, medium, and low ranges of Internet Usage.
sorted_df2 <- df2[order(df2$Avg_InternetUsage),] # sort df2 from lowest to highest based on avg_internet usage
low_range <- c(sorted_df2$Avg_InternetUsage[c(0: (51/3))]) # subset states with low Internet Usage
medium_range <- c(sorted_df2$Avg_InternetUsage[c((51/3): (2*51/3))]) # subset states with medium Internet Usage
high_range <- c(sorted_df2$Avg_InternetUsage[c((2*51/3) : 51)]) # subset states with high Internet Usage
# Find high, medium, and low ranges of Internert usage at work
sorted_df2_work <- df2[order(df2$Avg_InternetUsageAtWork ),]
low_range_work <- c(sorted_df2_work$Avg_InternetUsageAtWork[c(0: (51/3))])
medium_range_work <- c(sorted_df2_work$Avg_InternetUsageAtWork[c((51/3): (2*51/3))])
high_range_work <- c(sorted_df2_work$Avg_InternetUsageAtWork[c((2*51/3) : 51)])
# Find high, medium, and low ranges of Internert usage at home
sorted_df2_home <- df2[order(df2$Avg_InternetUsageAtHome ),]
low_range_home <- c(sorted_df2_home$Avg_InternetUsageAtHome[c(0: (51/3))])
medium_range_home <- c(sorted_df2_home$Avg_InternetUsageAtHome[c((51/3): (2*51/3))])
high_range_home <- c(sorted_df2_home$Avg_InternetUsageAtHome[c((2*51/3) : 51)])
# Find high, medium, and low ranges of Internert usage at coffee shops
sorted_df2_coffee <- df2[order(df2$Avg_InternetUsageAtCoffeeShops),]
low_range_coffee <- c(sorted_df2_coffee$Avg_InternetUsageAtCoffeeShops[c(0: (51/3))])
medium_range_coffee <- c(sorted_df2_coffee$Avg_InternetUsageAtCoffeeShops[c((51/3): (2*51/3))])
high_range_coffee <- c(sorted_df2_coffee$Avg_InternetUsageAtCoffeeShops[c((2*51/3) : 51)])
# Create column for Internet Usage Levels
df2$InternetUsageLevel <- df2$InternetUsage
df2$InternetUsageAtHomeLevel <- df2$InternetUsageAtHome
df2$InternetUsageAtWorkLevel <- df2$InternetUsageAtWork
df2$InternetUsageAtCoffeeShopsLevel<- df2$InternetUsageAtCoffeeShops
Create a function to categorize internet usage level.
UsageLevel <- function(col, low_range, medium_range, high_range ){
for (i in 1:nrow(df2)){
if(col[i] %in% low_range){
col[i] = "Low"
}
if(col[i] %in% medium_range){
col[i] = "Medium"
}
if(col[i] %in% high_range){
col[i] = "High"
}
}
result <- col
return(result)
}
Get internet usage level by category.
df2$InternetUsageLevel <- UsageLevel(df2$Avg_InternetUsage,low_range,medium_range,high_range)
df2$InternetUsageAtHomeLevel <- UsageLevel(df2$Avg_InternetUsageAtHome, low_range_home, medium_range_home, high_range_home)
df2$InternetUsageAtWorkLevel <- UsageLevel(df2$Avg_InternetUsageAtWork, low_range_work, medium_range_work, high_range_work)
df2$InternetUsageAtCoffeeShopsLevel <- UsageLevel(df2$Avg_InternetUsageAtCoffeeShops, low_range_coffee, medium_range_coffee, high_range_coffee)
Add percent of total population for each age range.
df2$YoungProportion <- (df2$male0to9 + df2$male10to19 + df2$male20to29 + df2$female0to9 + df2$female10to19 + df2$female20to29) / df2$TotalPopulation
df2$MiddleProportion <- (df2$male30to39 + df2$male40to49 + df2$male50to59 + df2$female30to39 + df2$female40to49 + df2$female50to59) / df2$TotalPopulation
df2$OldProportion <- 1 - df2$YoungProportion - df2$MiddleProportion
Create discrete categories for Young Proportion.
third = quantile(df2$YoungProportion, 1/3)
two_third = quantile(df2$YoungProportion, 2/3)
df2$YoungCategories <- if_else(df2$YoungProportion < third, "Low", if_else(df2$YoungProportion < two_third, "Medium", "High"))
Export to csv.
write.csv(df2,file="./CleanedInternetUsageByState.csv")
Now we have a nicely formatted csv file ready for analysis!
In order to further analyse these results, we can use the census data to view the proportion of young people for the Top 3 and Bottom 3 states by internet usage. Note that the top two states by internet usage (New Hampshire, Vermont) have much lower proportions of young people than the bottom states (Alabama, Mississippi, Tennessee).
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors